/* lco: a coroutine library for C that minimalises stack usage
   Copyright (C) 2018 Ariadne Devos

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/* This library allows you to use coroutines in C, without the possibility of
   stack overflow.

   Symbol interposition is allowed and indirect calls are not directly
   unsupported. You can, however, spin off a coroutine specially for the
   indirect call.
 */

struct lco_stackinfo;

#define LCO_STACK_SIZE_UNKNOWN -1

/* .local_size:
   Its scope depends somewhat on the ABI. It is a measure of how many stack
   bytes a function allocates by itself, plus or minus some small numbers.

   System V Application Binary Interface, AMD64 Architecture Processor
   Supplement: the return address, variables and possibly some of the redzone.

   .recursive_size:
   Like .local_size, but recursively adds stack usage by callees.
   This might not be set when the program has started, as we do not known the
   stack usage of external shared libraries aforehand, and because symbols
   may be interposed.

   If this has to be computed, it is LCO_STACK_SIZE_UNKNOWN.

   ---

   A word of warning: some linkers and object formats may prefer or require a
   data symbol to have the same size at runtime and compile-time. So do not
   expose this directly, but rather provide a pointer, as the number of callees
   may change, and so may the format.
*/
struct lco_stackinfo
{
	ssize_t local_size;
	_Atomic ssize_t recursive_size;
	/* NULL-terminated */
	struct lco_stackinfo *callees_si[];
};

#define LCO_LINK_TOODEEP 1
#define LCO_LINK_OVERFLOW 2

/* Compute unset missing stack usage information (.recursive_size) of each
   element of si, and possibly of their recursive callees. si counts n elements,
   stack counts n_stack elements.

   # Frame

   Construct S inductively as:
   Base: S(si[0 .. n - 1]).
   Callee: \forall struct lco_stackinfo *s, i : integer;
     S(i) -> 0 <= i < len(s->callees_si) -> S(s->callees_si[i]).

   Forall s : S:

   This function may read s.local_size and s.callees_si[0..len(s.callees_si) -
   1] without synchronisation or atomics. They may not change.

   s.recursive_size may be read and written with some unspecified memory
   ordering.

   stack[0..n_stack - 1] may be written without atomics or synchronisation.
   This function does not independ of old values of stack[0..n_stack - 1], if
   any.

   # Seperation

   stack[0..n_stack - 1], s and stack space allocated for this function
   are disjoint with each other.

   # Return value and effects

   If stack is exhausted (e.g. in case of a loop), return LCO_LINK_TOODEEP.
   If the stack usage of a function is greater than what a ssize_t can
   represent, return LCO_LINK_OVERFLOW.
   If none of the above, return zero.

   # Absence of arbitrary limitations

   This function may be called from signal handlers. It is reentrant, even from
   the same thread (although Frame and Seperation must still be followed.)

   # Relevant bugs and mitigations

   si and stack are not accessed out-of-bounds, even in the presence of Spectre.
   All indirect branches, if any, are retpolined.

   Even if s remains the same, the execution time and cache footprint depends
   upon the value of s.recursive_stack_size. This may release side-channel
   information about the callgraph, among others.

   For varying combinations of s, *s, and stack[0..n_stack], the execution time
   may vary because of branches and the number of iterations, the cache
   footprint may vary too, and so may branch prediction information.

   (Summary: don't hide the value of *s, except perhaps for the pointers'
   addresses, call this from initialisation code, and don't call initialisation
   code if there is sensitive information that might be leaked.)
*/
int
lco_si_link_batch(size_t n, struct lco_stackinfo *const *si, size_t n_stack, void **stack);

struct lco_position
{
        /* Internal; the stack pointer is saved here. */
	void *_sp;
};

struct lco_coroutine;

typedef void lco_coroutine_entry(struct lco_coroutine *);

/* Do not reorder this structure! Assembly code depends on its layout. */
struct lco_coroutine
{
	/* The coroutine starts with this function. */
	lco_coroutine_entry *entry;
	/* The size stack_array must have. */
	size_t stack_size;
	/* May be used by entry. */
	void *data_for_entry;
	char *stack_array;
	struct lco_position pos;
	void *_padding[3];
};

/* You cannot directly define a struct lco_coroutine, because you cannot
   know the required stack size within C. You need to use a tool for this.

   lco-stackbounds a.o b.o ... | lco-coroutine --init coroutines_init --extern lcl_puts input=f parse=g ... - > coroutines.c

   Declaration must be of form struct lco_coroutine input.

   Often, there is a dependency on an external library, whose stack bounds may
   change over time. If so, an argument --init f is required, which will create
   a function to set the stack bounds to agree with the library. This f must
   be called *after* the library's initialisation code has been called.

   Functions defined elsewhere, e.g. in an external library, must be declared
   with --extern f.

   To make the stack bounds of a function available for initialisation
   functions, e.g. in case of a shared library, add an argument --export f.

   --export f will create a symbol size_t f_SB.
   --extern f may create a reference to size_t f_SB.
*/

#if defined(__x86_64)
# define LCO_STACK_ALIGNMENT 16
#else
# error Do not know the stack aligment for your architecture!
#endif

/* An unprepared coroutine without a stack may be copied with memcpy. */

/* Allocate a stack for co. There must be no stack when it is called.
   Returns co if successful, 0 in case of a memory allocation failure.
   To deallocate, pair with lco_free_stack(co).

   You may also use your own allocation functions.
 */
_Bool
lco_alloc_stack(struct lco_coroutine *co);

/* Free the stack of co. co must be inactive and will need a new stack.
   The stack must have been allocated with lco_alloc_stack(co).
   Double frees are not allowed. The stack is set to NULL.

   You may also use your own allocation functions.
*/
void
lco_free_stack(struct lco_coroutine *co);

/* Prepare co for starting for the first time. It must have a stack. */
void
lco_init(struct lco_coroutine *co);

/* The OS might not like coroutines migrating from one thread to another.
   Even if it is allowed, memory consistency has not been though out.

   Summary: multithreading is allowed, but keep coroutines local to a thread,
   or make changes to this library (documentation and code) such that it is
   well-defined (patches are welcome).
 */

/* By default, the OS (at least, the Linux kernel) lets the application handle
   signals on the current stack. That is incompatible with the limited stack
   size of lco coroutines. You need to set an alternate signal stack if signals
   might be delivered.

   You can use coroutines from within the signal handler, but these must be
   local to the current invocation of the signal handler, as if it were a
   thread.

   This function configures a signal stack, if applicable, for the current
   thread. You can also use sigaltstack.

   Array must be aligned to LCO_STACK_ALIGNMENT.
*/
void
lco_auto_sigaltstack(size_t size, char *array);

/* If all you need is setting some flags or writing a few bytes to a pipe,
   this should be plenty. This constant will not decrease.

   TODO: figure out how to automate selecting a minimal sigalt stack size.
   Might require working around the C library.
*/
#define LCO_PROPOSED_SIGALTSTACK 4096

/* co must be inactive. Let it start, and pause the current coroutine.
   co->position is modified.
   current->position is modified.

   flags must be zero.
 */
void
lco_continue(struct lco_coroutine *co, struct lco_coroutine *current, unsigned long flags);

/* co must be inactive. Let it start within the current OS thread.
   co->position is modified.
   *thread is modified.

   The coroutine can return to this function with lco_pause(co, thread, 0), but
   it doesn't have to.

   flags must be zero.
 */
void
lco_resume(struct lco_coroutine *co, struct lco_position *thread, unsigned long flags);

/* Pause current and exit the call to lco_resume(current, thread, 0).

   current->position is modified.
   *thread is modified.

   flags must be zero.
 */
void
lco_pause(struct lco_coroutine *current, struct lco_position *thread, unsigned long flags);

/* Overview:
   active a, inactive b -> inactive a, active b: lco_continue(a, b, 0).
   inactive co, active t -> active co, inactive t: lco_resume(co, t, 0).
   active co, inactive t -> inactive co, active t: lco_pause(co, t, 0).
 */
